Model format for all data from all journals (N = 47353)
| container.title | n |
|---|---|
| Applied and Environmental Microbiology | 8613 |
| Genome Announcements | 6578 |
| Microbiology Resource Announcements | 5691 |
| Journal of Bacteriology | 4656 |
| Journal of Virology | 4577 |
| Journal of Clinical Microbiology | 4369 |
| Antimicrobial Agents and Chemotherapy | 3223 |
| Microbiology Spectrum | 2900 |
| mBio | 2438 |
| Infection and Immunity | 1854 |
| mSystems | 1406 |
| mSphere | 1041 |
| Journal of Microbiology & Biology Education | 7 |
# predict.glm(total_model, type = "terms",)
plot_model(total_model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120,180]"))
Graph is very busy with all 47K observations
see 2nd plot for 5K observations for a smaller group to examine (index 25000-30000)
Residuals look like an amorphous blob as suggested by Abner@CSCAR
simulationOutput <- simulateResiduals(fittedModel = total_model, plot = F)
#there is no alpha parameter in base R plot
residuals(simulationOutput) %>%
plot(main = "Residuals plotted by numerical index", pch = ".")
residuals_hist <-
residuals(simulationOutput) %>% tibble(residual = .) %>%
ggplot(aes(x = residual)) +
geom_histogram(linewidth = 0.25, color = "white", binwidth = 1/100) +
ggtitle("Residuals Plotted for all journals together")
print(residuals_hist)
# residuals(simulationOutput)[25000:30000] %>%
# plot(main = "Residuals for observation indices 25000 - 3000")
KS Test = Two sample Kolmogorov-Smirnov (KS) Test
This function tests the overall uniformity of the simulated residuals in a DHARMa object - Deviation is significant between the expected residuals and the actual observed residuals.
“If the P value is small, conclude that the two groups were sampled from populations with different distributions.” -Prism help page
Dispersion Test
This function performs simulation-based tests for over/underdispersion
Over / underdispersion means that the observed data is more / less dispersed than expected under the fitted model.
Deviation is significant between the observed data and fitted model.
Outlier Test
This function tests if the number of observations outside the simulation envelope are larger or smaller than expected
Methods generate a null expectation, and then test for an excess or lack of outliers. Per default, testOutliers() looks for both, so if you get a significant p-value, you have to check if you have to many or too few outliers.
See Outlier test for distribution of outliers. - Many at 1.0 residual.
plot(simulationOutput, sub = "All data glm(fixed)")
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
plotQQunif(simulationOutput)
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
plotResiduals(simulationOutput)
DHARMa::testOutliers(simulationOutput, type = "bootstrap")
##
## DHARMa bootstrapped outlier test
##
## data: simulationOutput
## outliers at both margin(s) = 538, observations = 47353, p-value <
## 2.2e-16
## alternative hypothesis: two.sided
## percent confidence interval:
## 0.007019091 0.008966169
## sample estimates:
## outlier frequency (expected: 0.00788545604291175 )
## 0.01136148
#smaller model with 2 terms
two_term_glmnb <-function(model_data, model_name) {
total_model <-MASS::glm.nb(is.referenced.by.count~ da_factor + log(age.in.months) +
+ log(age.in.months)*da_factor + log(age.in.months)*da_factor, data = model_data, link = log)
return(total_model)
}
journals <-
nsd_yes_metadata %>%
count(journal_abrev) %>%
filter(journal_abrev != "jmbe")
#j <- 5
for(j in 1:nrow(journals)) {
journal_data <-
nsd_yes_metadata %>%
filter(journal_abrev == journals[[j,1]]) %>%
mutate(da_factor = factor(da))
actual_citations <-
journal_data %>%
filter(age.in.months %in% c(12, 60, 84, 120, 180)) %>%
group_by(age.in.months, da_factor) %>%
count(is.referenced.by.count, .drop = FALSE) %>%
summarize(mean_citations = mean(is.referenced.by.count)) %>%
ggplot(aes(x = da_factor, y = mean_citations, color = factor(age.in.months))) +
geom_point() +
ggtitle(paste0("Observed mean citations\nPlotted for journal ", journals[[j,1]]))
print(actual_citations)
model <- two_term_glmnb(journal_data, journals[[j,1]])
# p <- get_model_data(model = model, type = "pred", terms = c("da_factor",
# "age.in.months[12,60,84,120,180]"))
if(j == 3) { #genomea = 5
plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[84,120,144]")) + ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]]))
p <- get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[84,120,144]"))
}
else if (j == 9) { ##Mra - 7
plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84]")) + ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]]))
p <- get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84]"))
}
else if (j == 10 | j == 11 ) { #msphere, msystems 9 years
plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,108]")) + ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]]))
p <- get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,108]"))
}
else if (j == 12 ) {
plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120]")) + ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]]))
p <- get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120]"))
}
else {
plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120,180]")) + ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]]))
p <- get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120,180]"))
}
ratio_plot <- tibble(da = p$x, predicted = p$predicted, age.in.months = as.numeric(as.character(p$group))) %>%
pivot_wider(names_from = da, values_from = predicted) %>%
mutate(ratio = `2`/`1`) %>%
ggplot(., aes(x = age.in.months, y = ratio)) +
geom_point() +
labs(title = paste0("Ratio of predicted citations for da = Yes to da = No \nPlotted for journal ", journals[[j,1]]),
y = "Ratio number of citations da=Yes/da=No")
print(plot_model)
print(ratio_plot)
simulation <- simulateResiduals(fittedModel = model, plot = F)
# residuals <-
# residuals(simulation) %>% tibble(residual = .) %>%
# ggplot(aes(y = residual, x = seq_along(residual) )) +
# geom_point(size = 1) +
# geom_smooth() +
# ggtitle(paste0("Residuals with geom_smooth line' \nPlotted for journal ", journals[[j,1]]))
# print(residuals)
residuals_hist <-
residuals(simulation) %>% tibble(residual = .) %>%
ggplot(aes(x = residual)) +
geom_histogram(linewidth = 0.25, color = "white", binwidth = 1/100) +
ggtitle(paste0("Residuals Plotted for journal ", journals[[j,1]]))
print(residuals_hist)
plot(simulation, sub = paste0("Simulation plotted for journal ", journals[[j,1]]))
}
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L = G$L,
## : Fitting terminated with step failure - check results carefully
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
## DHARMa:testOutliers with type = binomial may have inflated Type I error rates for integer-valued distributions. To get a more exact result, it is recommended to re-run testOutliers with type = 'bootstrap'. See ?testOutliers for details
j <- 5
journal_data <-
nsd_yes_metadata %>%
filter(journal_abrev == journals[[j,1]]) %>%
mutate(da_factor = factor(da)) %>%
filter(age.in.months <= 120)
actual_citations <-
journal_data %>%
filter(age.in.months %in% c(12, 60, 84, 120)) %>%
group_by(age.in.months, da_factor) %>%
count(is.referenced.by.count, .drop = FALSE) %>%
summarize(mean_citations = mean(is.referenced.by.count)) %>%
ggplot(aes(x = da_factor, y = mean_citations, color = factor(age.in.months))) +
geom_point() +
ggtitle(paste0("Observed mean citations\nPlotted for journal ", journals[[j,1]]))
## `summarise()` has grouped output by 'age.in.months'. You can override using the
## `.groups` argument.
print(actual_citations)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
model <- two_term_glmnb(journal_data, journals[[j,1]])
# assign(journal_abrev[[j,1]], model)
plot_model <- plot_model(model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120]")) + ggtitle(paste0("Predicted counts of 'is.referenced.by.count' \nPlotted for journal ", journals[[j,1]], " for the last 10 years"))
p <- get_model_data(model = model, type = "pred", terms = c("da_factor", "age.in.months[12,60,84,120,180]"))
ratio_plot <- tibble(da = p$x, predicted = p$predicted, age.in.months = as.numeric(as.character(p$group))) %>%
pivot_wider(names_from = da, values_from = predicted) %>%
mutate(ratio = `2`/`1`) %>%
ggplot(., aes(x = age.in.months, y = ratio)) +
geom_point() +
labs(title = paste0("Ratio of predicted citations for da = Yes to da = No \nPlotted for journal ", journals[[j,1]]),
y = "Ratio number of citations da=Yes/da=No")
print(plot_model)
print(ratio_plot)
simulation <- simulateResiduals(fittedModel = model, plot = F)
# residuals(simulation) %>%
# plot(main = paste0("Residuals plotted for journal ", journals[[j,1]]), pch = ".")
residuals(simulation) %>% tibble(residual = .) %>%
ggplot(aes(y = residual, x = seq_along(residual) )) +
geom_point(size = 1) +
geom_smooth() +
ggtitle(paste0("Residuals with geom_smooth line' \nPlotted for journal ", journals[[j,1]], " for the last 10 years"))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
plot(simulation, sub = paste0("Simulation plotted for journal ", journals[[j,1]]))
residuals_hist <-
residuals(simulation) %>% tibble(residual = .) %>%
ggplot(aes(x = residual)) +
geom_histogram(linewidth = 0.25, color = "white", binwidth = 1/100) +
ggtitle(paste0("Residuals Plotted for journal ", journals[[j,1]]))
print(residuals_hist)
plot(simulation, sub = paste0("Simulation plotted for journal ", journals[[j,1]]))